# data read-in
# Load the digit data: each X file holds 144 pixel-intensity columns per
# image (12x12, flattened); each Y file holds the single-column class label.
train_x <- read.csv('MNISTTrainXV2.csv')
train_y <- read.csv('MNISTTrainY.csv')
test_x <- read.csv('MNISTTestXRand.csv')
test_y <- read.csv("MNISTTestYRand.csv")
validation_x <- read.csv('MNISTValidationX.csv')
validation_y <- read.csv('MNISTValidationY.csv')
# plot function from instruction
# Render a flattened square grayscale digit image.
#
# Args:
#   x:  pixel intensities (numeric vector or one-row data frame) whose
#       length is a perfect square (e.g. 144 for a 12x12 image).
#   bw: if TRUE, binarize pixels at intensity 50 before plotting.
#   ...: further arguments passed on to image() (e.g. main).
#
# Side effects: draws on the current graphics device; sets par(pty = "s").
plot_digit <- function(x, bw = FALSE, ...) {
  n <- sqrt(length(x))  # compute once instead of three times
  if (n != round(n)) {
    # stop() already signals the message; wrapping it in print() just
    # printed it a second time as a side effect.
    stop("Not a square image! Something is wrong here.", call. = FALSE)
  }
  if (isTRUE(bw)) {
    # Threshold to black/white: pixels above 50 become 256, the rest 0.
    x <- as.numeric(x > 50) * 256
  }
  par(pty = "s")  # square plotting region
  # Reverse the columns so the image is drawn right side up.
  image(matrix(as.matrix(x), nrow = n)[, n:1], col = gray(12:1 / 12), ...)
}
#Example
# Plot the first training image in grayscale; its stored label is 0.
plot_digit(x = train_x[1,], bw = FALSE, main = "True Class = 0") #x vector length = square of integer

# bw= TRUE: round up pixel to black or white
plot_digit(x = train_x[1,], bw = TRUE, main = "True Class = 0")

Collaborators: Billy Ge, Latifa Tan, Annie Luo

Question 1 (15 pts.)

Part 1 Solution

# Spot-check assorted training rows: print the stored label, then plot the
# image so label and picture can be compared by eye.
train_y[50,] #image label 0
## [1] 0
plot_digit(x = train_x[50,], bw = FALSE, main = "True Class = 0")

train_y[1500,] #0
## [1] 0
plot_digit(x = train_x[1500,], bw = FALSE, main = "True Class = 0")

train_y[2800,] #1
## [1] 1
plot_digit(x = train_x[2800,], bw = FALSE, main = "True Class = 1")

train_y[6500,] #2
## [1] 2
plot_digit(x = train_x[6500,], bw = FALSE, main = "True Class = 2")

train_y[10000,] #3
## [1] 3
plot_digit(x = train_x[10000,], bw = FALSE, main = "True Class = 3")

train_y[13780,] #5
## [1] 5
plot_digit(x = train_x[13780,], bw = FALSE, main = "True Class = 5")

train_y[17000,] #6
## [1] 6
plot_digit(x = train_x[17000,], bw = FALSE, main = "True Class = 6")

train_y[20000,] #7
## [1] 7
plot_digit(x = train_x[20000,], bw = FALSE, main = "True Class = 7")

train_y[23730,] #9
## [1] 9
plot_digit(x = train_x[23730,], bw = FALSE, main = "True Class = 9")

Yes the images match the labels.

Part 2 Solution

# Per-class mean images. The 27,500 training rows are laid out in blocks of
# 2,500 per digit class, so row j of mean_train is the pixel-wise mean image
# of class j - 1.
mean_train <- matrix(nrow = 10, ncol = 144)
class_start <- seq(1, 27500, 2500)  # first row of each class block
for (cls in 1:10) {
  block_rows <- class_start[cls]:(class_start[cls + 1] - 1)
  mean_train[cls, ] <- colMeans(train_x[block_rows, ])
}
# Visualize the ten average digits.
for (cls in 1:10) {
  plot_digit(x = mean_train[cls, ], bw = FALSE)
}

Classes for labels 4, 5, 6, 8, and 9 seem to show the most within-class variation over images; classes for labels 1 and 0 seem to show the least within-class variation over images.

Part 3 Solution

# function to identify similarity using euclidean
# input: sample observation[one row/list] with 144 vectors 
# output: a table with the similarity between THE observation and [0-9]
# Example observation used below: the first training image (class 0).
ob<-train_x[1,]
# Squared-Euclidean "similarity" between one observation and each class mean
# (smaller value = more similar).
#
# Args:
#   ob:     the observation's 144 pixel values. For source == "trainSet" it
#           is a one-row data frame (a row of train_x); for "avgSet" a plain
#           numeric vector (a row of mean_train).
#   source: "trainSet" or "avgSet" — selects how ob is unpacked.
#   means:  10 x 144 matrix of class mean images; defaults to the global
#           mean_train (kept for backward compatibility, now injectable).
#
# Returns: data frame with columns `integer` (0-9) and `similarity`
#          (squared Euclidean distance to that class's mean image).
sim_check <- function(ob, source, means = mean_train) {
  df <- data.frame(integer = 0:9, similarity = NA_real_)
  # Flatten the observation to a bare numeric vector once, instead of
  # re-extracting a scalar 144 times inside the loop as before.
  obs_vec <- if (source == "trainSet") {
    as.numeric(unlist(ob))      # one-row data frame case
  } else if (source == "avgSet") {
    as.numeric(ob)              # plain numeric vector case
  } else {
    stop("source must be 'trainSet' or 'avgSet'", call. = FALSE)
  }
  for (i in 1:10) {
    # Vectorized squared distance replaces the original per-pixel loop
    # (which also redundantly re-assigned check_num every iteration).
    df[i, 2] <- sum((obs_vec - means[i, ])^2)
  }
  return(df)
}
# example: when we input the first observation in the dataset(from class==0), we could get its similarity table with each integer(base on the average from last question)
observation_similarity<-sim_check(ob,"trainSet")
observation_similarity
# if it make sense: find the most similar distinct number for each integer
#   e.g. find integer among[1-9] that is most similar with 0
# For each class mean, drop its own row and record which OTHER class mean is
# closest (smallest squared distance).
avg_similarity<- data.frame(expand.grid(0:9, 1), stringsAsFactors = F)
names(avg_similarity)=c("integer","top_similar_num")
for (i in 1:10){
  check_int<-mean_train[i,]
  temp_df<-sim_check(check_int,"avgSet")
  temp_df<-temp_df[-c(i),]   # exclude the class's distance to itself (always 0)
  avg_similarity[i,2]<-temp_df[which.min(temp_df$similarity),1]
}
avg_similarity

We use euclidean distance between each obs. to means of each classes as our measure of similarity. Since there are certain classes that have large variation within the class (4,5,6,8,9), the classes with large variances are going to be difficult to tell apart. The zero-one pair will be easy since those classes have low variances.

Question 2 (15 pts.)

Question 2 Data

#subset data to only include 0s and 1s
# NOTE(review): assumes the 0 and 1 classes occupy the leading rows of the
# train/validation files (2,500 and 1,500 per class) - confirm the layout.
p2_trainX <- train_x[1:5000,]
p2_trainY <- train_y[1:5000,]
p2_validX <- validation_x[1:3000,]
p2_validY <- validation_y[1:3000,]

Part 1 Solution

#logistic regression model
# Headline fit: logistic regression of the 0/1 label on all 144 pixels.
p2_train <- data.frame(p2_trainX, p2_trainY)
p2p1_logistic <- glm(data=p2_train, 
                     formula = p2_trainY ~ .,
                     family = binomial(link='logit'),
                     control = list(maxit = 100))
#in-sample EPE
# NOTE(review): create_folds() is not base R - presumably splitTools;
# confirm the corresponding library() call exists earlier in the file.
folds <- create_folds(seq(1,nrow(p2_train)), k = 10, type = "basic")
mis_rate <- c()
for(i in 1:length(folds)){
  countMs = 0
  # NOTE(review): this glm() omits family = binomial, so each CV model is an
  # ordinary least-squares fit, unlike the logistic model fitted above.
  # NOTE(review): folds[[i]] is used as the TRAINING set and the other nine
  # folds as the test set - the reverse of standard k-fold CV; confirm.
  logistic_cv <- glm(p2_trainY ~ ., data=p2_train[folds[[i]],], trace=FALSE)
  logistic_cv_pred <- predict(logistic_cv, newdata = p2_train[-folds[[i]],], type = "response")
  #classify predictions
  # Threshold the fitted values at 0.5 to get hard 0/1 labels.
  for (j in 1:length(logistic_cv_pred)){
    if (logistic_cv_pred[j] > 0.5) {
      logistic_cv_pred[j] <- 1
    }
    else {
      logistic_cv_pred[j] <- 0
    }
  }
  #Count of Misclassifications
  for (k in 1:length(logistic_cv_pred)) {
    if (logistic_cv_pred[k] != p2_train[-folds[[i]],]$p2_trainY[k]){
      countMs = countMs + 1
    }
  }
  mis_rate[i] <- countMs / length(logistic_cv_pred)
}
p2p1_epe<-mean(mis_rate)
p2p1_epe #0.0036
## [1] 0.0032
#oos
# Out-of-sample error: predict the validation images with the full model.
countMs = 0
logistic_valid <- predict(p2p1_logistic, newdata = p2_validX, type = "response")
#classify predictions
for (j in 1:length(logistic_valid)){
  if (logistic_valid[j] > 0.5) {
    logistic_valid[j] <- 1
  }
  else {
    logistic_valid[j] <- 0
  }
}
#Count of Misclassifications
for (k in 1:length(logistic_valid)) {
  if (logistic_valid[k] != p2_validY[k]){
    #print(k) misclassified: 271, 1057, 2203, 2259, 2311, 2555
    countMs = countMs + 1
  }
}
p2p1_oos <- countMs / length(logistic_valid)
p2p1_oos #0.002
## [1] 0.002
zeroOne_result <- data.frame("In Sample" = p2p1_epe, "OOS" = p2p1_oos)
zeroOne_result

The in-sample EPE and oos PE are both extremely small. Our model has great predictive capabilities. It’s easy to find separators between 0 and 1, which makes LR shine, because LR draws simple linear boundaries and thus has high precision. Logistic regression calculates the probability of digit 1 given all the pixel values of each image. Since 0 and 1 are very different images visually, the logistic regression is capable of classifying them easily.

# Inspect a few of the misclassified validation images by index.
countMs #there are 6 misclassifications: 271, 1057, 2203, 2259, 2311, 2555
## [1] 6
validation_y[271,] #0
## [1] 0
plot_digit(x = validation_x[271,], bw = FALSE, main = "True Class = 0")

validation_y[1057,] #0
## [1] 0
plot_digit(x = validation_x[1057,], bw = FALSE, main = "True Class = 0")

validation_y[2203,] #1
## [1] 1
plot_digit(x = validation_x[2203,], bw = FALSE, main = "True Class = 1")

validation_y[2311,] #1
## [1] 1
plot_digit(x = validation_x[2311,], bw = FALSE, main = "True Class = 1")

The misclassification does make sense, because the ones have a lot of white area around the middle vertical line (when x is 0.4-0.6), and zeros have a lot of gray area around the middle vertical line.

Part 2 Solution

#LASSO
# glmnet wants plain numeric matrices, not data frames.
p2p2_trainX <- data.matrix(p2_trainX)
p2p2_trainY <- data.matrix(p2_trainY)
#scientific lambda
# 10-fold CV over the lambda path to choose the penalty.
# NOTE(review): type.measure = "mse" with family = "binomial" - "class" or
# "deviance" is the more conventional choice for classification; confirm.
p2_cv_lasso <- cv.glmnet(x=p2p2_trainX, y=p2p2_trainY, family="binomial",
                     control = list(maxit = 100), alpha=1, nfolds=10,
                          type.measure="mse")
#our chosen value of lambda is
p2_cv_lasso$lambda.min
## [1] 0.0001582495
# Refit a single lasso-penalized logistic model at the CV-chosen lambda.
p2_lasso <- glmnet(x=p2p2_trainX, y=p2p2_trainY, family="binomial", alpha=1,
                            lambda=p2_cv_lasso$lambda.min)
p2_lasso$beta #coefficients
## 144 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## Row1Col1    .           
## Row1Col2    .           
## Row1Col3    0.2152761445
## Row1Col4    .           
## Row1Col5    .           
## Row1Col6    .           
## Row1Col7    .           
## Row1Col8    0.0173377595
## Row1Col9    .           
## Row1Col10   .           
## Row1Col11   .           
## Row1Col12   .           
## Row2Col1    .           
## Row2Col2    .           
## Row2Col3    .           
## Row2Col4    .           
## Row2Col5    .           
## Row2Col6    .           
## Row2Col7    .           
## Row2Col8    .           
## Row2Col9    .           
## Row2Col10   .           
## Row2Col11   0.0013917526
## Row2Col12   .           
## Row3Col1    .           
## Row3Col2    .           
## Row3Col3    .           
## Row3Col4    .           
## Row3Col5   -0.0028891515
## Row3Col6    .           
## Row3Col7    .           
## Row3Col8   -0.0014196955
## Row3Col9    .           
## Row3Col10   .           
## Row3Col11   .           
## Row3Col12  -0.0063710099
## Row4Col1    .           
## Row4Col2    .           
## Row4Col3    .           
## Row4Col4    .           
## Row4Col5    .           
## Row4Col6    .           
## Row4Col7    .           
## Row4Col8    .           
## Row4Col9    .           
## Row4Col10  -0.0007511194
## Row4Col11   .           
## Row4Col12   .           
## Row5Col1    .           
## Row5Col2    .           
## Row5Col3   -0.0012242005
## Row5Col4   -0.0061211032
## Row5Col5    .           
## Row5Col6    .           
## Row5Col7    .           
## Row5Col8    .           
## Row5Col9   -0.0021530844
## Row5Col10  -0.0039942347
## Row5Col11  -0.0029983281
## Row5Col12   .           
## Row6Col1   -0.0289702445
## Row6Col2    .           
## Row6Col3   -0.0035376018
## Row6Col4   -0.0025680243
## Row6Col5    .           
## Row6Col6    .           
## Row6Col7    0.0092705194
## Row6Col8    .           
## Row6Col9   -0.0071890080
## Row6Col10  -0.0134948085
## Row6Col11  -0.0200574625
## Row6Col12  -0.0089503399
## Row7Col1    .           
## Row7Col2    .           
## Row7Col3   -0.0012166434
## Row7Col4    .           
## Row7Col5   -0.0077516568
## Row7Col6    0.0083734506
## Row7Col7    0.0203533782
## Row7Col8    .           
## Row7Col9   -0.0146666076
## Row7Col10  -0.0030162350
## Row7Col11   .           
## Row7Col12   .           
## Row8Col1    .           
## Row8Col2    .           
## Row8Col3   -0.0054909734
## Row8Col4   -0.0077316566
## Row8Col5   -0.0103708717
## Row8Col6    0.0020068307
## Row8Col7    .           
## Row8Col8   -0.0083740172
## Row8Col9   -0.0016438706
## Row8Col10  -0.0091810549
## Row8Col11   .           
## Row8Col12   .           
## Row9Col1    .           
## Row9Col2    .           
## Row9Col3    .           
## Row9Col4    .           
## Row9Col5    .           
## Row9Col6    .           
## Row9Col7    .           
## Row9Col8    .           
## Row9Col9    .           
## Row9Col10   .           
## Row9Col11   .           
## Row9Col12   .           
## Row10Col1   .           
## Row10Col2   .           
## Row10Col3   .           
## Row10Col4   .           
## Row10Col5  -0.0026402010
## Row10Col6   .           
## Row10Col7   .           
## Row10Col8   .           
## Row10Col9   .           
## Row10Col10  .           
## Row10Col11  .           
## Row10Col12  0.0225219021
## Row11Col1   .           
## Row11Col2   .           
## Row11Col3   .           
## Row11Col4   .           
## Row11Col5  -0.0024171682
## Row11Col6  -0.0049157171
## Row11Col7   .           
## Row11Col8   .           
## Row11Col9   .           
## Row11Col10  .           
## Row11Col11  .           
## Row11Col12  .           
## Row12Col1   .           
## Row12Col2   .           
## Row12Col3   0.0062603675
## Row12Col4   0.0011313093
## Row12Col5   .           
## Row12Col6   .           
## Row12Col7   .           
## Row12Col8   .           
## Row12Col9   .           
## Row12Col10  .           
## Row12Col11  .           
## Row12Col12  .
#correlations
# Collapse each lasso coefficient to its sign: +1 (pixel evidence for class
# 1), -1 (evidence for class 0), 0 (pixel dropped by the penalty).
# This loop is equivalent to an elementwise sign().
p2_corr <- c()
for (i in 1:length(p2_lasso$beta)){
  if (p2_lasso$beta[i] > 0) {
    p2_corr[i] = 1
  }
  else if (p2_lasso$beta[i] < 0) {
    p2_corr[i] = -1
  }
  else {
    p2_corr[i] = 0
  }
}
cf1 <- colnames(p2_trainX)
names(p2_corr) <- cf1
# byrow = TRUE so entry [r, c] matches pixel "Row r Col c" of the 12x12 image.
matrix(nrow = 12, ncol = 12, p2_corr, byrow = TRUE)
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
##  [1,]    0    0    1    0    0    0    0    1    0     0     0     0
##  [2,]    0    0    0    0    0    0    0    0    0     0     1     0
##  [3,]    0    0    0    0   -1    0    0   -1    0     0     0    -1
##  [4,]    0    0    0    0    0    0    0    0    0    -1     0     0
##  [5,]    0    0   -1   -1    0    0    0    0   -1    -1    -1     0
##  [6,]   -1    0   -1   -1    0    0    1    0   -1    -1    -1    -1
##  [7,]    0    0   -1    0   -1    1    1    0   -1    -1     0     0
##  [8,]    0    0   -1   -1   -1    1    0   -1   -1    -1     0     0
##  [9,]    0    0    0    0    0    0    0    0    0     0     0     0
## [10,]    0    0    0    0   -1    0    0    0    0     0     0     1
## [11,]    0    0    0    0   -1   -1    0    0    0     0     0     0
## [12,]    0    0    1    1    0    0    0    0    0     0     0     0

As shown in the above matrix, where 1 represents a positive correlation, 0 non-important correlation, and -1 a negative correlation, we can see that the center has a positive correlation, meaning that the center being filled with pencil mark signals the class of one. There’s also a 1 in [1,8], showing a pencil mark on the middle top signals the class of one. And there are -1 around the center, meaning marks there would decrease the probability of the number being classified as one, and therefore signals the class of zero.

Question 3 (25 pts.)

Question 3 Data

# Subset to the two Question 3 classes: training rows 10001-12500 are the 4s
# and 22501-25000 the 9s (2,500 each); validation rows 6001-7500 are 4s and
# 13501-15000 are 9s (1,500 each).
# NOTE(review): relies on the fixed class-block layout of the files - confirm.
p3_trainX <- train_x[c(10001:12500, 22501:25000),]
p3_trainY <- train_y[c(10001:12500, 22501:25000),]
p3_validX <- validation_x[c(6001:7500, 13501:15000),]
p3_validY <- validation_y[c(6001:7500, 13501:15000),]

Part 1 Solution

#recode for logistic regression
# Recode labels to 0/1 for the binomial glm: 4 -> 0, 9 -> 1, relying on the
# class-block row order established when p3_* were built.
length(p3_trainY)
## [1] 5000
p3p1_trainY <- matrix(nrow = 5000, ncol = 1)
p3p1_trainY[1:2500,] = 0 #4 -> 0
p3p1_trainY[2501:5000,] = 1 #9 -> 1
length(p3_validY)
## [1] 3000
p3p1_validY <- matrix(nrow = 3000, ncol = 1)
p3p1_validY[1:1500,] = 0 #4 -> 0
p3p1_validY[1501:3000,] = 1 #9 -> 1
#logistic regression model
p3_logTrain <- data.frame(p3_trainX, p3p1_trainY)
p3p1_logistic <- glm(data=p3_logTrain, 
                     formula = p3p1_trainY ~ .,
                     family = binomial(link='logit'),
                     control = list(maxit = 100))
#in-sample EPE
folds <- create_folds(seq(1,nrow(p3_logTrain)), k = 10, type = "basic")
mis_rate <- c()
for(i in 1:length(folds)){
  countMs = 0
  # NOTE(review): as in Question 2, this CV glm() omits family = binomial
  # (so it is an OLS fit) and trains on the single fold folds[[i]] while
  # testing on the other nine - the reverse of standard k-fold CV; confirm.
  logistic_cv <- glm(p3p1_trainY ~ ., data=p3_logTrain[folds[[i]],], trace=FALSE)
  logistic_cv_pred <- predict(logistic_cv, newdata = p3_logTrain[-folds[[i]],], type = "response")
  #classify predictions
  for (j in 1:length(logistic_cv_pred)){
    if (logistic_cv_pred[j] > 0.5) {
      logistic_cv_pred[j] <- 1
    }
    else {
      logistic_cv_pred[j] <- 0
    }
  }
  #Count of Misclassifications
  for (k in 1:length(logistic_cv_pred)) {
    if (logistic_cv_pred[k] != p3_logTrain[-folds[[i]],]$p3p1_trainY[k]){
      countMs = countMs + 1
    }
  }
  mis_rate[i] <- countMs / length(logistic_cv_pred)
}
p3p1_epe<-mean(mis_rate)
p3p1_epe #approx. 0.0508; numerically unstable
## [1] 0.05
#oos
countMs = 0
logistic_valid <- predict(p3p1_logistic, newdata = p3_validX, type = "response")
#classify predictions
for (j in 1:length(logistic_valid)){
  if (logistic_valid[j] > 0.5) {
    logistic_valid[j] <- 1
  }
  else {
    logistic_valid[j] <- 0
  }
}
#Count of Misclassifications
for (k in 1:length(logistic_valid)) {
  if (logistic_valid[k] != p3p1_validY[k]){
    countMs = countMs + 1
  }
}
p3p1_oos <- countMs / length(logistic_valid)
p3p1_oos #0.035
## [1] 0.035
fourNine_result <- data.frame("In sample logistic"=p3p1_epe, 'OOS logistic'=p3p1_oos)
fourNine_result

The four-nine classification’s in-sample EPE is 0.0508, while oos EPE is 0.035. The EPE is roughly ten times larger than in the zero-one classification. The results differ mainly because four and nine naturally look much more similar to each other than zero and one do.

#LASSO
p3_lTrainX <- data.matrix(p3_trainX)
p3_lTrainY <- data.matrix(p3_trainY)
#scientific lambda
# NOTE(review): as in Question 2, type.measure = "mse" with a binomial
# family - "class" or "deviance" is the usual choice; confirm.
p3_cv_lasso <- cv.glmnet(x=p3_lTrainX, y=p3_lTrainY, family="binomial",
                     control = list(maxit = 100), alpha=1, nfolds=10,
                          type.measure="mse")
#our chosen value of lambda is
p3_cv_lasso$lambda.min
## [1] 0.0008488916
# Refit at the CV-chosen lambda. y here is the raw 4/9 labels; presumably
# glmnet models the second factor level (9) as the "success" class - confirm.
p3_lasso <- glmnet(x=p3_lTrainX, y=p3_lTrainY, family="binomial", alpha=1,
                            lambda=p3_cv_lasso$lambda.min)
p3_lasso$beta #coefficients
## 144 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## Row1Col1   -1.665330e-02
## Row1Col2    9.147675e-03
## Row1Col3    .           
## Row1Col4    .           
## Row1Col5    1.384745e-03
## Row1Col6   -2.124713e-02
## Row1Col7    .           
## Row1Col8    .           
## Row1Col9    .           
## Row1Col10   .           
## Row1Col11  -1.985560e-02
## Row1Col12  -4.275375e-02
## Row2Col1    3.841653e-02
## Row2Col2    .           
## Row2Col3   -5.547256e-03
## Row2Col4   -4.005452e-03
## Row2Col5    .           
## Row2Col6    6.474270e-03
## Row2Col7    2.694117e-03
## Row2Col8   -3.053271e-03
## Row2Col9    3.868944e-03
## Row2Col10   .           
## Row2Col11   .           
## Row2Col12   .           
## Row3Col1   -5.934213e-02
## Row3Col2   -2.743025e-02
## Row3Col3   -3.166567e-03
## Row3Col4    .           
## Row3Col5    5.224555e-03
## Row3Col6    8.850511e-03
## Row3Col7    1.136328e-02
## Row3Col8    1.075646e-02
## Row3Col9    2.981617e-03
## Row3Col10   3.234512e-03
## Row3Col11  -6.841969e-03
## Row3Col12  -4.039753e-02
## Row4Col1   -3.307602e-03
## Row4Col2    .           
## Row4Col3    1.178490e-04
## Row4Col4    2.939138e-06
## Row4Col5    4.344254e-03
## Row4Col6    8.446253e-03
## Row4Col7    1.749658e-02
## Row4Col8    6.021416e-03
## Row4Col9    3.344846e-03
## Row4Col10   .           
## Row4Col11   6.557792e-03
## Row4Col12   .           
## Row5Col1    .           
## Row5Col2    1.028417e-02
## Row5Col3    3.090678e-03
## Row5Col4    .           
## Row5Col5   -8.802930e-04
## Row5Col6    1.152156e-03
## Row5Col7    2.775349e-03
## Row5Col8   -3.042050e-03
## Row5Col9    2.200537e-03
## Row5Col10   1.621384e-03
## Row5Col11   .           
## Row5Col12   1.582235e-02
## Row6Col1   -9.392727e-03
## Row6Col2    .           
## Row6Col3    .           
## Row6Col4   -7.550925e-04
## Row6Col5   -1.068524e-02
## Row6Col6   -1.173256e-02
## Row6Col7    4.178094e-03
## Row6Col8    1.586552e-03
## Row6Col9    3.351060e-03
## Row6Col10   .           
## Row6Col11  -1.948497e-03
## Row6Col12  -4.452960e-02
## Row7Col1    .           
## Row7Col2    .           
## Row7Col3   -6.844262e-03
## Row7Col4   -8.602384e-03
## Row7Col5   -4.486658e-03
## Row7Col6   -2.543229e-03
## Row7Col7   -4.628112e-03
## Row7Col8   -1.930305e-03
## Row7Col9   -2.132315e-03
## Row7Col10   .           
## Row7Col11  -1.019658e-02
## Row7Col12   .           
## Row8Col1   -3.424718e-02
## Row8Col2    .           
## Row8Col3   -2.965009e-03
## Row8Col4   -2.160979e-03
## Row8Col5    2.476668e-03
## Row8Col6   -3.828673e-03
## Row8Col7   -1.048006e-02
## Row8Col8   -5.532876e-03
## Row8Col9   -3.738306e-03
## Row8Col10  -3.107509e-03
## Row8Col11   .           
## Row8Col12  -2.015637e-02
## Row9Col1   -6.671154e-03
## Row9Col2   -1.273187e-02
## Row9Col3    .           
## Row9Col4    2.401169e-05
## Row9Col5    2.456011e-03
## Row9Col6   -6.334147e-03
## Row9Col7   -3.411481e-03
## Row9Col8   -3.505407e-03
## Row9Col9    .           
## Row9Col10   4.300997e-04
## Row9Col11   .           
## Row9Col12   .           
## Row10Col1  -3.146215e-02
## Row10Col2   2.024099e-02
## Row10Col3   .           
## Row10Col4  -4.401226e-03
## Row10Col5  -1.417770e-03
## Row10Col6   .           
## Row10Col7  -3.761588e-04
## Row10Col8   .           
## Row10Col9   4.045596e-03
## Row10Col10  1.284812e-03
## Row10Col11  .           
## Row10Col12  2.945700e-02
## Row11Col1   .           
## Row11Col2   1.414574e-02
## Row11Col3   1.159566e-02
## Row11Col4   .           
## Row11Col5   .           
## Row11Col6   1.913894e-04
## Row11Col7   .           
## Row11Col8   .           
## Row11Col9  -4.838328e-03
## Row11Col10 -3.936783e-03
## Row11Col11  7.257718e-03
## Row11Col12  1.614644e-02
## Row12Col1  -2.517017e-02
## Row12Col2   .           
## Row12Col3   1.283688e-02
## Row12Col4   1.575813e-02
## Row12Col5   1.276878e-02
## Row12Col6   1.259224e-02
## Row12Col7   1.038032e-02
## Row12Col8   1.009736e-02
## Row12Col9   1.214080e-02
## Row12Col10  1.603566e-02
## Row12Col11  8.401491e-03
## Row12Col12  1.328244e-02
#correlations
# Collapse each lasso coefficient to its sign: +1 suggests the pixel signals
# a nine, -1 a four, 0 means the penalty dropped the pixel. Equivalent to an
# elementwise sign().
p3_corr <- c()
for (i in 1:length(p3_lasso$beta)){
  if (p3_lasso$beta[i] > 0) {
    p3_corr[i] = 1
  }
  else if (p3_lasso$beta[i] < 0) {
    p3_corr[i] = -1
  }
  else {
    p3_corr[i] = 0
  }
}
cf1 <- colnames(p3_trainX)
names(p3_corr) <- cf1
# byrow = TRUE so entry [r, c] matches pixel "Row r Col c" of the 12x12 image.
matrix(nrow = 12, ncol = 12, p3_corr, byrow = TRUE)
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
##  [1,]   -1    1    0    0    1   -1    0    0    0     0    -1    -1
##  [2,]    1    0   -1   -1    0    1    1   -1    1     0     0     0
##  [3,]   -1   -1   -1    0    1    1    1    1    1     1    -1    -1
##  [4,]   -1    0    1    1    1    1    1    1    1     0     1     0
##  [5,]    0    1    1    0   -1    1    1   -1    1     1     0     1
##  [6,]   -1    0    0   -1   -1   -1    1    1    1     0    -1    -1
##  [7,]    0    0   -1   -1   -1   -1   -1   -1   -1     0    -1     0
##  [8,]   -1    0   -1   -1    1   -1   -1   -1   -1    -1     0    -1
##  [9,]   -1   -1    0    1    1   -1   -1   -1    0     1     0     0
## [10,]   -1    1    0   -1   -1    0   -1    0    1     1     0     1
## [11,]    0    1    1    0    0    1    0    0   -1    -1     1     1
## [12,]   -1    0    1    1    1    1    1    1    1     1     1     1

One big difference is that the four-nine problem has many more important pixels for classification than zero-one. This conforms to our intuition that, because it’s harder to tell four and nine apart, we need more features to predict the class. The upper center is concentrated with 1 in a semi-circular shape, signaling the top half of nine. The lower center (row 8) is largely concentrated with -1, signaling the horizontal line at the bottom of four.

Part 2 Solution

# Combine predictors and labels for the formula interfaces below.
p3_train <- data.frame(p3_trainX, p3_trainY)
p3_valid <- data.frame(p3_validX, p3_validY)
# QDA 
# NOTE(review): qda()/lda() presumably come from MASS - confirm the
# library() call exists earlier in the file.
FN_QDA_classifier <- qda(p3_trainY ~ ., data=p3_train)
FN_QDA <- predict(FN_QDA_classifier,p3_valid)$class
FN_QDA_oos <- mean(FN_QDA != p3_valid$p3_validY)
FN_QDA_oos #0.06533333
## [1] 0.06533333
fourNine_result$QDA <- FN_QDA_oos
# Naive Bayes classifier: KDE naive bayes
#we use KDE since data was high dimensional and has a lot of obs.
# NOTE(review): the LHS is as.character(p3_train$p3_trainY), a transformed
# copy rather than the column name, so the "." on the RHS may still expand
# to include the p3_trainY column itself - potential label leakage; confirm.
FN_naiveBayes_classifier<-naive_bayes(as.character(p3_train$p3_trainY) ~ ., data=p3_train, usekernel=TRUE)
FN_naiveBayes <- predict(FN_naiveBayes_classifier, p3_valid, type= "class")
FN_naiveBayes_oos <- mean(FN_naiveBayes != p3_valid$p3_validY)
FN_naiveBayes_oos #0.16
## [1] 0.16
fourNine_result$NaiveBayes <- FN_naiveBayes_oos
fourNine_result

We can see that logistic regression has the best performance, while NB has the worst. It makes sense that NaiveBayes does not perform very well, since when one pixel has pencil marks, it’s likely that the pixels around it are marked too, which violates the independence assumption required by NB. QDA relies on the multivariate normal density (MND) assumption where, for each class, the predictors are drawn from a multivariate normal density function. In other words, QDA assumes that the marginal distribution of predictors is normal. With this assumption, QDA models \(\log( \frac{Pr(Y=k | X=x)}{Pr(Y=K | X=x)})\) in a quadratic way in the sense that it allows interaction of pairs of predictors. It makes sense that it outperforms NB due to high correlation between pixels. However, our data does not conform well to QDA’s assumed MND distribution of features, because most of our pixels are either 250 (pencil-marked) or 0 (not marked), rather than distributed on a continuous scale. LR, on the other hand, assumes a Bernoulli distribution of the outcome variable, which our data conforms to since our two possible outcomes, four and nine, are represented via a 2-class categorical variable.

Part 3 Solution

We have used Logistic Regression, QDA, and NB w/ KDE for continuous predictors. As explained above, we will not use NB without KDE due to high dimensionality. Since there are only two classes–four and nine, we won’t be using Multinomial logistic regression. We also won’t be using KNN due to the curse of high dimensionality. We’ve seen in the matrix created from the lasso problem that there seems to be a lot of pixels that are quite important in the prediction, so we won’t use DANN either since we don’t expect a huge reduction of dimensions. We also won’t be treating predictors as discrete for NB, since the variables are values on a gray scale which are continuous. For the same reason, we also won’t be using Bayes’ Theorem for discrete features. We will first continue our prediction with the lasso model we built before. Then, we will build new models using LDA, generalized additive LR, linearSVM, polySVM, radialSVM, and tree methods (Bagging, Random Forest, Probability Trees, Boosted Trees). We attempted the GAM for Logistic Regression, but since it’s too computationally expensive to try tensor ti() to capture interactions for all combinations, we decided to not continue using GAM for LR. We believe we didn’t lose much info for not trying this method, since we already tried generalized methods of LDA and QDA (which captures interactions between different combinations of two variables).

#lasso
# Predict hard "4"/"9" labels on the validation set from the earlier lasso fit.
p3_lvalidX <- as.matrix(p3_validX)
FN_lasso <- predict(p3_lasso, p3_lvalidX, type= "class")
FN_lasso_oos <- mean(FN_lasso != p3_validY)
FN_lasso_oos #0.03233333
## [1] 0.03233333
fourNine_result$lasso <- FN_lasso_oos
#LDA
FN_LDA_classifier <- lda(p3_trainY ~ ., data=p3_train)
FN_LDA <- predict(FN_LDA_classifier,p3_validX)$class
FN_LDA_oos <- mean(FN_LDA != p3_validY)
fourNine_result$LDA <- FN_LDA_oos 
#we can see that LDA performs slightly better than QDA, so we believe that LDA is appropriate in the Four-Nine problem, where QDA has a little less precision. This is not surprising because we expected very unclean cuts as Four and Nine look very similar. Hence, instead of the unclean cut provided by QDA, a simpler linear decision boundary in high dimension space might have a better performance, as suggested by LDA.
#SVM data prep
# ksvm/ranger below want a factor response, so build factored copies.
p3_trainYfactored <- factor(p3_trainY)
p3_trainFactored <- data.frame(p3_trainX, p3_trainYfactored)
p3_validYfactored <- factor(p3_validY)
p3_validFactored <- data.frame(p3_validX, p3_validYfactored)

#linearSVM
# to find optimal cost and degree
# Grid-search C over an exponential grid, keeping the setting with the
# lowest 5-fold CV error reported by ksvm.
optimal_cost<-0
optimal_degree<-0
optimal_epe<-1000000
#Create a vector of potential C values. 
#We set our cost in an exponential way so that the cost we try won't be too high that the model overfits, and there are more cost values between 0 and 1 that we would try. This is because we expect the optimal cost value to fall between 0 and 1 for good generalizability, and we want to test carefully which cost value exactly would be the optimal value through CV. This tuning method applies to linearSVM, polySVM, and radialSVM.
cost_vals <- exp(seq(-3,3,length = 30))
for(i in 1:length(cost_vals)){
  #Train the SVM
  svm1 <- ksvm(p3_trainYfactored ~ . , data = p3_trainFactored, type = "C-svc", kernel = "vanilladot", 
                C = cost_vals[i], cross = 5, prob.model = TRUE, kpar = list())
  #Extract the 5fold CV est and store
  temp_p3 <- kernlab::cross(svm1)
  if (temp_p3<optimal_epe){
    optimal_epe<-temp_p3
    optimal_cost<-cost_vals[i]
  }
}
# finalized model
# Refit at the winning C and score the validation set.
FN_linearSVM_classfier <- ksvm(p3_trainYfactored ~ . , data = p3_trainFactored, type = "C-svc", 
                               kernel = "vanilladot", C = optimal_cost, cross = 5, prob.model = TRUE, 
                               kpar = list())
FN_linearSVM <- predict(FN_linearSVM_classfier, newdata = p3_validFactored , type = "response")
FN_linearSVM_oos<- mean(FN_linearSVM != p3_validYfactored)
fourNine_result$linearSVM <- FN_linearSVM_oos

#polySVM
# to find optimal cost and degree
# Same grid search as above, additionally over polynomial degree 2 and 3.
optimal_cost<-0
optimal_degree<-0
optimal_epe<-1000000
#Create a vector of potential C values
cost_vals <- exp(seq(-3,3,length = 30))
for (j in 2:3){ #conventionally 2nd or 3rd degree polynomial would be enough
  for(i in 1:length(cost_vals)){
    #Train the SVM
    svm2 <- ksvm(p3_trainYfactored ~ . , data = p3_trainFactored, type = "C-svc", kernel = "polydot", 
                 kpar = list(degree = j), C = cost_vals[i], cross = 5, prob.model = TRUE)
    #Extract the 5fold CV est and store
    temp_p3 <- kernlab::cross(svm2)
    if (temp_p3<optimal_epe){
      optimal_epe<-temp_p3
      optimal_cost<-cost_vals[i]
      optimal_degree<-j
    }
  }
}
# finalized model
FN_polySVM_classfier <- ksvm(p3_trainYfactored ~ . , data = p3_trainFactored, type = "C-svc", 
                             kernel = "polydot", kpar = list(degree = optimal_degree), C = optimal_cost,
                             cross = 5, prob.model = TRUE)
FN_polySVM <- predict(FN_polySVM_classfier, newdata = p3_validFactored , type = "response")
FN_polySVM_oos<- mean(FN_polySVM != p3_validYfactored)
fourNine_result$polySVM <- FN_polySVM_oos

#radialSVM
# to find optimal cost
# Same grid search as above with the RBF kernel (sigma left to ksvm's default).
optimal_cost<-0
optimal_degree<-0
optimal_epe<-1000000
#Create a vector of potential C values
cost_vals <- exp(seq(-3,3,length = 30))
for(i in 1:length(cost_vals)){
  #Train the SVM
  svm3 <- ksvm(p3_trainYfactored ~ . , data = p3_trainFactored, type = "C-svc", kernel = "rbfdot", 
                C = cost_vals[i], cross = 5, prob.model = TRUE)
  #Extract the 5fold CV est and store
  temp_p3 <- kernlab::cross(svm3)
  if (temp_p3<optimal_epe){
    optimal_epe<-temp_p3
    optimal_cost<-cost_vals[i]
  }
}
# finalized model
FN_radialSVM_classfier <- ksvm(p3_trainYfactored ~ . , data = p3_trainFactored, type = "C-svc", 
                            kernel = "rbfdot", C = optimal_cost, cross = 5, prob.model = TRUE)
FN_radialSVM <- predict(FN_radialSVM_classfier, newdata = p3_validFactored , type = "response")
FN_radialSVM_oos<- mean(FN_radialSVM != p3_validYfactored)
fourNine_result$radialSVM <- FN_radialSVM_oos
#tree methods(bagging, random forest, probability trees, boosted trees)
#data: y-var will also be factored. so we use same data we used in SVM.

#Bagging
#Since random forest increases variance between trees, it strictly dominates bagging trees. Hence we start off with random forest.

#Random Forest
# Tune mtry by out-of-bag error over a grid of candidate values.
optimal_varNum<-0
optimal_treeNum<-0
optimal_oob<-100000000
varNum <- c(2:8, 20, 60, 100) #too many var would be like bagging. but considering the high dimentionality of our x_train dataset, we decided to try a couple below 100.
for (i in 1:length(varNum)){
  rfMod<-ranger(p3_trainYfactored ~ . , data = p3_trainFactored, num.trees = 1000, 
                mtry = varNum[i], classification = TRUE)
  temp_oob<-rfMod$prediction.error
  if (temp_oob < optimal_oob){
    optimal_oob<-temp_oob
    optimal_varNum<-i # stores the grid INDEX; read back via varNum[optimal_varNum]
  }
}
## Growing trees.. Progress: 59%. Estimated remaining time: 21 seconds.
## Growing trees.. Progress: 35%. Estimated remaining time: 57 seconds.
## Growing trees.. Progress: 70%. Estimated remaining time: 26 seconds.
# Refit at the winning mtry and score the validation set.
FN_RF_classifier<-ranger(p3_trainYfactored ~ . , data = p3_trainFactored, 
                         num.trees = 1000, mtry = varNum[optimal_varNum], 
                         classification = TRUE)
FN_RF <- predict(FN_RF_classifier, data = p3_validFactored)$predictions
FN_RF_oos<- mean(FN_RF != p3_validFactored$p3_validYfactored)
fourNine_result$RF <- FN_RF_oos
# Probability Trees: tune mtry for a probability forest, same scheme as above.
optimal_varNum <- 0
optimal_oob <- Inf  # running best OOB error
varNum <- c(2:8, 20, 60, 100)
for (i in seq_along(varNum)) {
  probTMod <- ranger(p3_trainYfactored ~ . , data = p3_trainFactored, num.trees = 1000,
                     probability = TRUE, mtry = varNum[i], classification = TRUE)
  # NOTE(review): with probability = TRUE, prediction.error is the OOB Brier score,
  # not a misclassification rate — still a valid criterion for comparing mtry values.
  temp_oob <- probTMod$prediction.error
  if (temp_oob < optimal_oob) {
    optimal_oob <- temp_oob
    optimal_varNum <- i  # index into varNum
  }
}
## Growing trees.. Progress: 56%. Estimated remaining time: 24 seconds.
## Growing trees.. Progress: 33%. Estimated remaining time: 1 minute, 3 seconds.
## Growing trees.. Progress: 66%. Estimated remaining time: 32 seconds.
## Growing trees.. Progress: 99%. Estimated remaining time: 1 seconds.
# Finalized model: refit the probability forest with the tuned mtry.
FN_probT_classifier<-ranger(p3_trainYfactored ~ . , data = p3_trainFactored, 
                            num.trees = 1000, probability = TRUE, 
                            mtry = varNum[optimal_varNum], classification = TRUE)
## Growing trees.. Progress: 57%. Estimated remaining time: 23 seconds.
# Matrix of class probabilities, one row per validation observation;
# column 1 is class "4", column 2 is class "9" (factor levels in sorted order).
FN_probT <- predict(FN_probT_classifier, data = p3_validFactored)$predictions
# Vectorized class assignment over all rows. This replaces a hard-coded 1:3000
# grow-in-loop, so it works for any validation-set size; ties go to 9, as before.
FN_probTree <- factor(ifelse(FN_probT[, 1] > FN_probT[, 2], 4, 9))
FN_probT_oos <- mean(FN_probTree != p3_validFactored$p3_validYfactored)
fourNine_result$ProbabilityTrees <- FN_probT_oos
#Boosted Trees
#Data prep
#p3_trainY_GBM <- p3p1_trainY
#p3_train_GBM <- data.frame(p3_trainX, p3_trainY_GBM)
#p3_validY_GBM <- p3p1_validY
#p3_valid_GBM <- data.frame(p3_validX, p3_validY_GBM)

#ADA
#optimal_treeNum<-0
#optimal_err<-100000000
#treeNum <- seq(500, 3000, 500)
#for (i in 1:length(treeNum)){
  #ADAm <- gbm(p3_trainY_GBM ~ . , data = as.data.frame(p3_train_GBM), 
    #          distribution = "bernoulli", n.trees = treeNum[i], cv.folds = 5)
  #we use validation data to calculate temp_err to prevent overfit
  #temp_err <- mean(round(predict(ADAm, newdata = p3_valid_GBM, n.trees = 60, type = "response")) 
   #                != p3_valid_GBM$p3_validY_GBM)
  #if (temp_err < optimal_err){
   #   optimal_err<-temp_err
    #  optimal_treeNum<-i
  #}
#}
#FN_ADA_classifier <- gbm(p3_trainY_GBM ~ . , data = as.data.frame(p3_train_GBM), 
 #                        distribution = "bernoulli", n.trees = treeNum[optimal_treeNum], cv.folds = 5)
#FN_ADA_oos <- mean(round(predict(FN_ADA_classifier, newdata = p3_valid_GBM, n.trees = 60, 
 #                                type = "response")) != p3_valid_GBM$p3_validY_GBM)
#fourNine_result$ADA <- FN_ADA_oos

#XGB
#xgb_trainY <- p3p1_trainY #coded in {0,1}
#xgb_train <- data.frame(p3_trainX, xgb_trainY)
#xgb_train$xgb_trainY <- NULL
#xgb_validY <- p3p1_validY #coded in {0,1}
#xgb_valid <- data.frame(p3_validX, p3p1_validY)
#xgb_valid$p3p1_validY <- NULL
#depth <- seq(2,20,2)
#learn_rate <- seq(0.1,1,0.1)
#treeNum <- seq(50, 15000, 3000)
#optimal_depth<-0
#optimal_learn_rate<-0
#optimal_treeNum<-0
#optimal_err<-100000000
#waited 8 hours. don't run this!!
#for (i in 1:length(depth)){
 # for (j in 1:length(learn_rate)){
  #  for (k in 1:length(treeNum)){
   #   xgbMod <- xgboost(data = as.matrix(xgb_train), label = xgb_trainY, max.depth = depth[i], 
    #                    eta = learn_rate[j], nrounds = treeNum[k], 
     #                   objective = "binary:logistic", verbose = 0)
      #we use validation data to calculate temp_err to prevent overfit
      #temp_err <- mean(round(predict(xgbMod, newdata = as.matrix(xgb_valid))) != xgb_validY)
      #if (temp_err < optimal_err){
       # optimal_err<-temp_err
        #optimal_treeNum<-k
        #optimal_learn_rate<-j
        #optimal_depth<-i
      #}
    #}
  #}
#}
#depth[optimal_depth] #8
#learn_rate[optimal_learn_rate] #0.2
#treeNum[optimal_treeNum] #3050
#finalized model
#FN_xgb_classifier <- xgboost(data = as.matrix(xgb_train), label = xgb_trainY, 
 #                            max.depth = depth[optimal_depth], eta = learn_rate[optimal_learn_rate], 
  #                           nrounds = treeNum[optimal_treeNum], 
   #                     objective = "binary:logistic", verbose = 0)
#FN_xgb_oos <- mean(round(predict(xgbMod, newdata = as.matrix(xgb_valid))) != xgb_validY)
#fourNine_result$xgb <- FN_xgb_oos
# Out-of-sample error rate of every method tried on the 4-vs-9 problem,
# followed by the method that achieved the lowest one.
fourNine_result
fourNine_result[which.min(fourNine_result)]

We can see that polySVM performed the best. radialSVM performed worse, probably because we did not tune the kernel-width hyperparameter, since we didn't think it would be computationally worthwhile. SVM performed better than trees because tree methods are still somewhat sensitive to where the greedy search starts: in our case the tree methods may have found a local minimum rather than the global one, making them perform worse than the essentially assumption-free SVMs. QDA, LDA, NB, lasso, and LR all performed worse because their assumptions — about distributions, about features or outcomes, or about the shape of the decision boundary — were not properly fulfilled by our dataset, so they lagged behind the assumption-free SVM. While we tried boosted trees, we may have done a poor job of tuning their hyperparameters, resulting in a high out-of-sample EPE; we commented out the boosted-tree methods since they took very long to run.

# Print the indices of the validation digits the tuned polynomial SVM got wrong
# (four of these — 371, 1240, 2025, 2734 — are inspected individually below).
for (idx in which(FN_polySVM != p3_validYfactored)) {
  print(idx)
}
## [1] 371
## [1] 392
## [1] 401
## [1] 432
## [1] 521
## [1] 797
## [1] 896
## [1] 910
## [1] 1097
## [1] 1196
## [1] 1240
## [1] 1291
## [1] 1353
## [1] 1361
## [1] 1375
## [1] 1392
## [1] 1511
## [1] 2019
## [1] 2025
## [1] 2135
## [1] 2314
## [1] 2331
## [1] 2362
## [1] 2457
## [1] 2512
## [1] 2561
## [1] 2596
## [1] 2622
## [1] 2623
## [1] 2678
## [1] 2734
## [1] 2792
## [1] 2863
# Visual inspection of four misclassified cases: show the true label, then the image.
p3_validYfactored[371] #true label: 4
## [1] 4
## Levels: 4 9
plot_digit(x = p3_validX[371,], bw = FALSE, main = "True Class = 4")

p3_validYfactored[1240] #true label: 4
## [1] 4
## Levels: 4 9
plot_digit(x = p3_validX[1240,], bw = FALSE, main = "True Class = 4")

p3_validYfactored[2025] #true label: 9
## [1] 9
## Levels: 4 9
plot_digit(x = p3_validX[2025,], bw = FALSE, main = "True Class = 9")

p3_validYfactored[2734] #true label: 9
## [1] 9
## Levels: 4 9
plot_digit(x = p3_validX[2734,], bw = FALSE, main = "True Class = 9")

It makes sense that the above 4 images are misclassified — each digit does look like it belongs to the opposite class. However, we think images 1 and 4 would have a chance of being classified correctly if they were oriented more toward the center line and thus less slanted.

Part 4 Solution

# Predict test-set labels with the best 4-vs-9 model (polynomial SVM) and
# write them out as the Question 3 submission file.
test_y$label <- predict(FN_polySVM_classfier, newdata=test_x)
write.csv(as.matrix(test_y), file='Q3Predictions.csv', row.names=FALSE)

Question 4 (25 pts.)

Question 4 Data

# Question 4 data: three contiguous slices of the training and validation sets.
# Assumes each slice (7501:10000, 12501:15000, 20001:22500, and the matching
# validation ranges) holds a single digit class — TODO confirm against the labels.
p4_trainX <- train_x[c(7501:10000, 12501:15000, 20001:22500),]
p4_trainY <- train_y[c(7501:10000, 12501:15000, 20001:22500),]
p4_validX <- validation_x[c(4501:6000, 7501:9000, 12001:13500),]
p4_validY <- validation_y[c(4501:6000, 7501:9000, 12001:13500),]

Part 1 Solution

From inspecting the shoe pictures, we found a commonality: the 12th column has dark marks. The 3s, 5s, and 8s, however, are unlikely to have all or most of the 12th column filled. So we picked three points in the 12th column after transforming each observation into a 12x12 matrix, and tuned a grayscale threshold to find as many shoes as possible.

# Scan every training image for shoe-like marks: reshape each 144-pixel row into a
# 12x12 matrix and flag images whose 12th column is dark at rows 4, 6 and 8
# (digits rarely fill most of that column). Threshold 5 was tuned by eye.
shoes <- c()
for (i in seq_len(nrow(p4_trainX))) {
  # as.numeric() flattens the data-frame row into a plain numeric vector, so the
  # reshaped object is a numeric matrix rather than a matrix of list elements.
  img <- matrix(as.numeric(p4_trainX[i, ]), nrow = 12, ncol = 12, byrow = TRUE)
  if (img[6, 12] > 5 && img[4, 12] > 5 && img[8, 12] > 5) {
    shoes <- c(shoes, i)
  }
}
# Plot every candidate for visual confirmation.
for (i in seq_along(shoes)) {
  plot_digit(x = p4_trainX[shoes[i], ], bw = FALSE)
}

We then matched those pictures with the ones given and identified the remaining 2 shoes to be found. For one shoe, both its upper-right corner and lower-left corner seem to be marked, which should be rare for digits.

# Second pass: the missing shoe appears marked in both the upper-right [1,12]
# and lower-left [12,1] corners, which should be rare for digits.
obs <- c()
for (i in 1:7500){
  p4_trainMatrix <- matrix(nrow = 12, ncol = 12, p4_trainX[i,], byrow = TRUE)
  if (p4_trainMatrix[1,12] > 100 && p4_trainMatrix[12,1] > 100) {
    obs <- append(obs, i)
  }
}
# Plot the candidates for visual inspection.
for (i in 1:length(obs)){
  plot_digit(x = p4_trainX[obs[i],], bw = FALSE)
}

obs
## [1] 1244 2575
# Observation 1244 was confirmed to be a shoe; prepend it to the list
# (append(1244, shoes) is equivalent to c(1244, shoes)).
shoes <- append(1244, shoes)

Now we only have one shoe left to find. We observe that its middle-left and two points on the right are marked. We reasoned that those three points may be more likely to be filled for digits too, so we start by setting a high grey-scale value.

# Third pass: the last shoe has its middle-left [6,1] and two right-edge points
# ([4,12] and [7,12]) marked; a higher grayscale threshold narrows the candidates.
obs <- c()
for (i in 1:7500){
  p4_trainMatrix <- matrix(nrow = 12, ncol = 12, p4_trainX[i,], byrow = TRUE)
  if (p4_trainMatrix[6,1] > 50 && p4_trainMatrix[4,12] > 50 && p4_trainMatrix[7,12] > 50) {
    obs <- append(obs, i)
  }
}
# Plot the candidates for visual inspection.
for (i in 1:length(obs)){
  plot_digit(x = p4_trainX[obs[i],], bw = FALSE)
}

obs
## [1]  720 2190
# Observation 2190 was confirmed to be the final shoe; add it to the list.
shoes <- append(2190, shoes)
# Show the full set of identified shoes.
for (i in 1:length(shoes)){
  plot_digit(x = p4_trainX[shoes[i],], bw = FALSE)
}

Question 5 (20 pts.)

Part 1 Solution

Since there are 10 classes, logistic regression (which is suited for 2 classes) doesn't really work. Instead, we will start with multinomial logistic regression. Then we will try LASSO. For the same reasons as in the fourNine problem, we are not going to use GAM, Bayes for discrete features, or NB with discrete features. And because of the high dimensionality, we won't be using NB without KDE or KNN. We will be using MLR, LASSO, LDA, QDA, NB with continuous variables and KDE, 2 SVM methods (we are not using linear SVM for the same reason as in the fourNine problem), and tree methods. We won't be using bagging since RF is likely to outperform bagging. Also, to save computation time, we don't tune num.trees: in our previous experience, changing num.trees usually doesn't change the out-of-sample EPE much.

# Time the entire 10-class model comparison.
start.time <- Sys.time()
# Factored data: copies of the design matrices with the response attached as a
# factor column, as required by the formula interfaces used below.
train_factored <- train_x
train_factored$train_yFactored <- factor(train_y$label)
valid_factored <- validation_x
valid_factored$valid_yFactored <- factor(validation_y$label)

#MLR
# Multinomial logistic regression over all 10 digit classes. MaxNWts is raised
# because 144 predictors x 10 classes exceeds nnet's default weight cap.
multiLogistic_classifier <- nnet::multinom(train_yFactored ~., data = train_factored, 
                                     trace = FALSE, MaxNWts = 80000)
multiLog_pred<-predict(multiLogistic_classifier, newdata =valid_factored)
# BUG FIX: the response column of valid_factored is valid_yFactored, not
# test_yFactored. The old code compared against a NULL column, which yields
# NaN instead of the out-of-sample error rate.
multiLog_oos<- mean(multiLog_pred != valid_factored$valid_yFactored)
result <- data.frame("Multinomial LR" = multiLog_oos)


#lasso
# Multinomial lasso: cv.glmnet picks lambda via 5-fold CV, then refit at lambda.min.
lasso_TrainX <- data.matrix(train_x)
lasso_TrainY <- data.matrix(train_y)
lasso_validX <- as.matrix(validation_x)
# NOTE(review): type.measure = "mse" with family = "multinomial" — glmnet may fall
# back to deviance here; "class" would target misclassification directly. Confirm
# the intended CV metric.
cv_lasso <- cv.glmnet(x=lasso_TrainX, y=lasso_TrainY, family="multinomial",
                     control = list(maxit = 100), alpha=1, nfolds=5,
                          type.measure="mse")
lasso_classifier <- glmnet(x=lasso_TrainX, y=lasso_TrainY, family="multinomial", alpha=1,
                            lambda=cv_lasso$lambda.min)
# Predicted class labels (character matrix), compared against the validation labels.
lasso <- predict(lasso_classifier, lasso_validX, type= "class")
lasso_oos <- mean(lasso != validation_y)
result$lasso <- lasso_oos


#data prep
# Single data frame with the numeric label column, for the formula interface.
train <- data.frame(train_x, "train_y" = train_y$label)
#LDA
# Linear discriminant analysis; predict()$class gives the hard class labels.
LDA_classifier <- lda(train_y ~ ., data=train)
LDA <- predict(LDA_classifier,validation_x)$class
LDA_oos <- mean(LDA != validation_y$label)
result$LDA <- LDA_oos 
#QDA
# Quadratic discriminant analysis (per-class covariance matrices).
QDA_classifier <- qda(train_y ~ ., data=train)
QDA <- predict(QDA_classifier,validation_x)$class
QDA_oos <- mean(QDA != validation_y$label)
result$QDA <- QDA_oos
# Naive Bayes with kernel density estimates for the continuous pixel features
# (KDE chosen because the data are high dimensional with many observations).
# BUG FIX: the old formula `as.character(train$train_y) ~ .` referenced the variable
# `train` on the left-hand side, so `.` did NOT exclude the train_y column — the
# label leaked into the predictors. Naming the column directly makes `.` drop it.
naiveBayes_classifier<-naive_bayes(factor(train_y) ~ ., data=train, usekernel=TRUE)
naiveBayes <- predict(naiveBayes_classifier, validation_x, type= "class")
naiveBayes_oos <- mean(naiveBayes != validation_y$label)
result$NaiveBayes <- naiveBayes_oos



#SVM data prep
# Stratified subsample: 500 observations per digit class, because training kernel
# SVMs on the full training set is too slow. Seed fixed for reproducibility.
set.seed(456)
sam_train_factored <- train_factored %>% group_by(train_yFactored) %>% slice_sample(n=500)
#sam_train_factored$train_yFactored
#valid_factored
#valid_factored$valid_yFactored

#polySVM
# Grid search over polynomial degree (2-3) and cost, scored by 5-fold CV error.
optimal_cost<-0
optimal_degree<-0
optimal_epe<-1000000
cost_vals <- exp(seq(-3,3,length = 30))
#We set our cost in an exponential way so that the cost we try won't be too high that the model overfits, and there are more cost values between 0 and 1 that we would try. This is because we expect the optimal cost value to fall between 0 and 1 for good generalizability, and we want to test carefully which cost value exactly would be the optimal value through CV. This tuning method applies to polySVM and radialSVM.
for (j in 2:3){ #conventionally 2nd or 3rd degree polynomial would be enough
  for(i in 1:length(cost_vals)){
    #Train the SVM on the stratified subsample
    svm2 <- ksvm(train_yFactored ~ . , data = sam_train_factored, type = "C-svc", kernel = "polydot", 
                 kpar = list(degree = j), C = cost_vals[i], cross = 5, prob.model = TRUE)
    #Extract the 5-fold CV error estimate and keep the best (cost, degree) pair
    temp <- kernlab::cross(svm2)
    if (temp<optimal_epe){
      optimal_epe<-temp
      optimal_cost<-cost_vals[i]
      optimal_degree<-j
    }
  }
}
# Finalized model: refit at the tuned cost/degree, then evaluate on the full
# validation set (training still uses the subsample).
polySVM_classfier <- ksvm(train_yFactored ~ . , data = sam_train_factored, type = "C-svc", 
                             kernel = "polydot", kpar = list(degree = optimal_degree), C = optimal_cost,
                             cross = 5, prob.model = TRUE)
polySVM <- predict(polySVM_classfier, newdata = valid_factored, type = "response")
polySVM_oos<- mean(polySVM != valid_factored$valid_yFactored)
result$polySVM <- polySVM_oos

#radialSVM
# Tune only the cost; the RBF width is left to kernlab's automatic estimate.
optimal_cost<-0
# NOTE(review): optimal_degree is initialized but never used for the radial kernel.
optimal_degree<-0
optimal_epe<-1000000
#Create a vector of potential C values (log-spaced, denser below 1; see note above)
cost_vals <- exp(seq(-3,3,length = 30))
for(i in 1:length(cost_vals)){
  #Train the SVM on the stratified subsample
  svm3 <- ksvm(train_yFactored ~ . , data = sam_train_factored, type = "C-svc", kernel = "rbfdot", 
                C = cost_vals[i], cross = 5, prob.model = TRUE)
  #Extract the 5-fold CV error estimate and keep the best cost
  temp <- kernlab::cross(svm3)
  if (temp<optimal_epe){
    optimal_epe<-temp
    optimal_cost<-cost_vals[i]
  }
}
# Finalized model: refit at the tuned cost, evaluate on the full validation set.
radialSVM_classfier <- ksvm(train_yFactored ~ . , data = sam_train_factored, type = "C-svc", 
                            kernel = "rbfdot", C = optimal_cost, cross = 5, prob.model = TRUE)
radialSVM <- predict(radialSVM_classfier, newdata = valid_factored, type = "response")
radialSVM_oos<- mean(radialSVM != valid_factored$valid_yFactored)
result$radialSVM <- radialSVM_oos



#tree methods (bagging, random forest, probability trees, boosted trees)
#data: the factored response data frames built earlier are reused here.
#train_factored
#train_factored$train_yFactored
#valid_factored
#valid_factored$valid_yFactored

#Random Forest: tune mtry (candidate features per split) by OOB error.
optimal_varNum<-0
optimal_treeNum<-0
optimal_oob<-100000000
varNum <- c(2,8,20) #too many candidate variables per split behaves like bagging, but given the high dimensionality of the training data we try a few values up to 20.
for (i in 1:length(varNum)){
  rfMod<-ranger(train_yFactored ~ . , data = train_factored, num.trees = 1000, 
                mtry = varNum[i], classification = TRUE)
  # OOB misclassification error for this mtry
  temp_oob<-rfMod$prediction.error
  if (temp_oob < optimal_oob){
    optimal_oob<-temp_oob
    optimal_varNum<-i
  }
}
## Growing trees.. Progress: 63%. Estimated remaining time: 18 seconds.
## Growing trees.. Progress: 18%. Estimated remaining time: 2 minutes, 17 seconds.
## Growing trees.. Progress: 36%. Estimated remaining time: 1 minute, 48 seconds.
## Growing trees.. Progress: 54%. Estimated remaining time: 1 minute, 18 seconds.
## Growing trees.. Progress: 72%. Estimated remaining time: 48 seconds.
## Growing trees.. Progress: 90%. Estimated remaining time: 17 seconds.
## Growing trees.. Progress: 7%. Estimated remaining time: 6 minutes, 39 seconds.
## Growing trees.. Progress: 14%. Estimated remaining time: 6 minutes, 20 seconds.
## Growing trees.. Progress: 21%. Estimated remaining time: 5 minutes, 49 seconds.
## Growing trees.. Progress: 28%. Estimated remaining time: 5 minutes, 19 seconds.
## Growing trees.. Progress: 35%. Estimated remaining time: 4 minutes, 47 seconds.
## Growing trees.. Progress: 42%. Estimated remaining time: 4 minutes, 14 seconds.
## Growing trees.. Progress: 50%. Estimated remaining time: 3 minutes, 42 seconds.
## Growing trees.. Progress: 57%. Estimated remaining time: 3 minutes, 10 seconds.
## Growing trees.. Progress: 64%. Estimated remaining time: 2 minutes, 38 seconds.
## Growing trees.. Progress: 71%. Estimated remaining time: 2 minutes, 6 seconds.
## Growing trees.. Progress: 78%. Estimated remaining time: 1 minute, 35 seconds.
## Growing trees.. Progress: 86%. Estimated remaining time: 1 minute, 3 seconds.
## Growing trees.. Progress: 93%. Estimated remaining time: 32 seconds.
## Growing trees.. Progress: 100%. Estimated remaining time: 0 seconds.
# Refit the random forest on the full training set with the best mtry found above.
RF_classifier<-ranger(train_yFactored ~ . , data = train_factored, num.trees = 1000, 
                      mtry = varNum[optimal_varNum], classification = TRUE)
## Growing trees.. Progress: 7%. Estimated remaining time: 6 minutes, 39 seconds.
## Growing trees.. Progress: 14%. Estimated remaining time: 6 minutes, 11 seconds.
## Growing trees.. Progress: 22%. Estimated remaining time: 5 minutes, 41 seconds.
## Growing trees.. Progress: 29%. Estimated remaining time: 5 minutes, 9 seconds.
## Growing trees.. Progress: 36%. Estimated remaining time: 4 minutes, 36 seconds.
## Growing trees.. Progress: 44%. Estimated remaining time: 4 minutes, 4 seconds.
## Growing trees.. Progress: 51%. Estimated remaining time: 3 minutes, 35 seconds.
## Growing trees.. Progress: 58%. Estimated remaining time: 3 minutes, 4 seconds.
## Growing trees.. Progress: 65%. Estimated remaining time: 2 minutes, 32 seconds.
## Growing trees.. Progress: 72%. Estimated remaining time: 2 minutes, 0 seconds.
## Growing trees.. Progress: 80%. Estimated remaining time: 1 minute, 28 seconds.
## Growing trees.. Progress: 87%. Estimated remaining time: 57 seconds.
## Growing trees.. Progress: 94%. Estimated remaining time: 26 seconds.
# Validation-set predictions and out-of-sample misclassification rate.
RF <- predict(RF_classifier, data = valid_factored)$predictions
RF_oos<- mean(RF != valid_factored$valid_yFactored)
result$RF <- RF_oos

#Probability Trees: tune mtry for a probability forest, same scheme as above.
optimal_varNum<-0
optimal_treeNum<-0
optimal_oob<-100000000
varNum <- c(2,8,20)
for (i in 1:length(varNum)){
  probTMod<-ranger(train_yFactored ~ . , data = train_factored, num.trees = 1000, 
                  probability = TRUE, mtry = varNum[i], classification = TRUE)
  # NOTE(review): with probability = TRUE, prediction.error is the OOB Brier
  # score rather than a misclassification rate — still usable for comparing mtry.
  temp_oob<-probTMod$prediction.error
  if (temp_oob < optimal_oob){
    optimal_oob<-temp_oob
    optimal_varNum<-i
  }
}
## Growing trees.. Progress: 64%. Estimated remaining time: 17 seconds.
## Growing trees.. Progress: 18%. Estimated remaining time: 2 minutes, 17 seconds.
## Growing trees.. Progress: 37%. Estimated remaining time: 1 minute, 47 seconds.
## Growing trees.. Progress: 55%. Estimated remaining time: 1 minute, 16 seconds.
## Growing trees.. Progress: 73%. Estimated remaining time: 45 seconds.
## Growing trees.. Progress: 91%. Estimated remaining time: 14 seconds.
## Growing trees.. Progress: 7%. Estimated remaining time: 6 minutes, 46 seconds.
## Growing trees.. Progress: 15%. Estimated remaining time: 6 minutes, 5 seconds.
## Growing trees.. Progress: 22%. Estimated remaining time: 5 minutes, 35 seconds.
## Growing trees.. Progress: 29%. Estimated remaining time: 5 minutes, 4 seconds.
## Growing trees.. Progress: 36%. Estimated remaining time: 4 minutes, 34 seconds.
## Growing trees.. Progress: 44%. Estimated remaining time: 4 minutes, 1 seconds.
## Growing trees.. Progress: 51%. Estimated remaining time: 3 minutes, 32 seconds.
## Growing trees.. Progress: 58%. Estimated remaining time: 3 minutes, 1 seconds.
## Growing trees.. Progress: 65%. Estimated remaining time: 2 minutes, 31 seconds.
## Growing trees.. Progress: 72%. Estimated remaining time: 2 minutes, 0 seconds.
## Growing trees.. Progress: 80%. Estimated remaining time: 1 minute, 28 seconds.
## Growing trees.. Progress: 87%. Estimated remaining time: 57 seconds.
## Growing trees.. Progress: 94%. Estimated remaining time: 25 seconds.
# Finalized model: refit the probability forest with the tuned mtry.
probT_classifier<-ranger(train_yFactored ~ . , data = train_factored, num.trees = 1000,
                          probability = TRUE, mtry = varNum[optimal_varNum], classification = TRUE)
## Growing trees.. Progress: 7%. Estimated remaining time: 6 minutes, 45 seconds.
## Growing trees.. Progress: 14%. Estimated remaining time: 6 minutes, 17 seconds.
## Growing trees.. Progress: 21%. Estimated remaining time: 5 minutes, 43 seconds.
## Growing trees.. Progress: 28%. Estimated remaining time: 5 minutes, 11 seconds.
## Growing trees.. Progress: 36%. Estimated remaining time: 4 minutes, 39 seconds.
## Growing trees.. Progress: 43%. Estimated remaining time: 4 minutes, 9 seconds.
## Growing trees.. Progress: 50%. Estimated remaining time: 3 minutes, 38 seconds.
## Growing trees.. Progress: 57%. Estimated remaining time: 3 minutes, 7 seconds.
## Growing trees.. Progress: 64%. Estimated remaining time: 2 minutes, 35 seconds.
## Growing trees.. Progress: 72%. Estimated remaining time: 2 minutes, 4 seconds.
## Growing trees.. Progress: 79%. Estimated remaining time: 1 minute, 32 seconds.
## Growing trees.. Progress: 86%. Estimated remaining time: 1 minute, 1 seconds.
## Growing trees.. Progress: 93%. Estimated remaining time: 29 seconds.
# Matrix of class probabilities: one row per validation case, one column per digit.
probT <- predict(probT_classifier, data = valid_factored)$predictions
# Vectorized per-row argmax. This replaces a hard-coded 1:15000 grow-in-loop, so it
# adapts to the validation-set size. Columns are the factor levels "0".."9" in
# order, so column index minus 1 is the digit; ties.method = "first" reproduces
# which.max()'s first-maximum behavior.
probTree <- factor(max.col(probT, ties.method = "first") - 1)
probT_oos<- mean(probTree != valid_factored$valid_yFactored)
result$ProbabilityTrees <- probT_oos
end.time <- Sys.time()

#computation time
# Wall-clock time for the whole comparison (stored as a difftime in the table).
time.taken <- end.time - start.time
result$time.taken <- time.taken

As we can see, polySVM has the lowest out-of-sample EPE, as expected for the reasons explained in the fourNine problem. So we decided to go with polySVM as our final model.

polySVM_classfier
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 2.06295192635843 
## 
## Polynomial kernel function. 
##  Hyperparameters : degree =  3  scale =  1  offset =  1 
## 
## Number of Support Vectors : 2517 
## 
## Objective Function Value : 0 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -1e-04 -2e-04 -1e-04 -1e-04 -2e-04 -1e-04 -1e-04 -2e-04 -2e-04 -2e-04 -1e-04 -2e-04 -2e-04 -4e-04 -1e-04 -1e-04 -2e-04 -2e-04 -1e-04 -1e-04 -1e-04 -1e-04 -3e-04 -2e-04 
## Training error : 0 
## Cross validation error : 0.0598 
## Probability model included.

Part 2 Solution

# Confusion-count table: row r holds true digit r-1 (labelled by trueValueBelow);
# columns "0".."9" accumulate counts of the predicted classes. Despite the "Rate"
# in the name, the cells are raw counts.
pred_class_countRate<- data.frame(expand.grid(0:9, 0,0,0,0,0,0,0,0,0,0),
                                  stringsAsFactors = F)
names(pred_class_countRate)=c("trueValueBelow","0","1","2","3","4","5","6","7","8","9")

predYClass <- predict(polySVM_classfier, newdata = valid_factored, type = "response")

# Flags so only the FIRST example of each interesting confusion is captured.
check1<-FALSE
check2<-FALSE

for (i in 1: length(predYClass)){
  # as.numeric() on a factor gives the level index: 1..10 for digits 0..9.
  row_num<-as.numeric(valid_factored$valid_yFactored[i])
  # +1 shifts past the trueValueBelow column: predicted digit d lands in column d+2.
  col_num<-as.numeric(predYClass[i])+1
  if (check1== FALSE & row_num == 8 & col_num== 11 ){# first true 7 predicted as 9
      sample79<- i
      check1<-TRUE
  }
  if (check2==FALSE & row_num ==10 & col_num==6){# first true 9 predicted as 4
    sample49<-i
    check2<-TRUE
  }
  pred_class_countRate[row_num,col_num]<-pred_class_countRate[row_num,col_num]+1
}

4 & 9 (sample49) and 7 & 9 (sample79) are the most often misclassified pairs. The following are two examples we pulled out using the code above.

Potential causes of the misclassifications: for the 7-vs-9 case, the hand-written 7 curves in a bit too much at its left end. For the 4-vs-9 case, the location of the "circle" is confusing for the model — it may treat a circle on the right side as a sign of 4-ness.

Potential improvements at the data-cleaning stage: 1) improve the digit-orientation algorithm; 2) don't reduce the images to 12x12 pixels — sacrifice some run time for image quality.

# The two captured misclassifications: a true 9 predicted as 4, and a true 7
# predicted as 9.
plot_digit(x = validation_x[sample49,], bw = FALSE, main = "True Class = 9")

plot_digit(x = validation_x[sample79,], bw = FALSE, main = "True Class = 7")

Part 3 Solution

# Predict test-set labels with the final 10-class model and write the submission file.
test_y$label <- predict(polySVM_classfier, newdata=test_x, type = "response")
write.csv(as.matrix(test_y), file='Q5Predictions.csv', row.names=FALSE)